import os
import subprocess
from datetime import datetime
from glob import glob
from typing import Iterator, List, Tuple

import pandas as pd
from dateutil.parser import parse


# Absolute path of the directory that contains this script; the repository
# and output locations below are resolved relative to it.
project_dir = os.path.abspath(os.path.dirname(__file__))

# Directory (under project_dir) that is globbed two levels deep for
# repositories. The spelling is part of a runtime path and is kept as-is.
repository_dir = 'clonned_repos'

# Directory (under project_dir) that receives the git-log dumps and the CSV.
output_dir = 'results'

# File names whose add/delete history is looked up in every repository.
deleted_files = [
    'BUILD.bazel',
]

def _run_git_log(repo_dir: str, diff_filter: str, tracked_file: str, log_path: str) -> None:
    """Dump ``git log --diff-filter=<diff_filter> -- <tracked_file>`` into *log_path*.

    The command runs with *repo_dir* as its working directory. Its exit status
    is deliberately ignored, so a directory that is not a git repository just
    yields whatever git printed on stdout (normally nothing).
    """
    # An argument list plus ``cwd=`` replaces a shell string plus ``os.chdir``:
    # path and file names are never interpreted by a shell, and the working
    # directory of the calling process is left untouched.
    with open(log_path, "wb") as log_file:
        subprocess.run(
            ["git", "log", f"--diff-filter={diff_filter}", "--", tracked_file],
            cwd=repo_dir,
            stdout=log_file,
            check=False,
        )


def _parse_git_log(
    log_path: str, status: str, tracked_file: str, user: str, repo: str
) -> List[Tuple[str, str, str, str, str, str]]:
    """Turn a default-format ``git log`` dump into one row per commit.

    Each row is ``(commit, author, date, url, status, filename)``. Author and
    date are attached to the commit header they follow, so a missing or
    unparsable field leaves an empty string in that row instead of shifting
    the values of later commits.
    """
    try:
        with open(log_path, "r", encoding="utf-8", errors="replace") as log_file:
            lines = log_file.readlines()
    except OSError:
        # Best effort: a log that cannot be read contributes no rows.
        return []

    entries = []  # one [commit, author, date] list per "commit" header seen
    for line in lines:
        if line.startswith("commit "):
            tokens = line.split()
            # Only the first token after "commit" is the hash; anything after
            # it (e.g. ref decorations) is ignored.
            entries.append([tokens[1] if len(tokens) > 1 else "", "", ""])
        elif not entries:
            # Header-like text before the first commit line belongs to nothing.
            continue
        elif line.startswith("Author:"):
            entries[-1][1] = line[len("Author:"):].strip()
        elif line.startswith("Date:"):
            try:
                commit_date = parse(line[len("Date:"):].strip()).date()
            except (ValueError, OverflowError):
                # Keep the row, with an empty date, rather than dropping it.
                continue
            entries[-1][2] = commit_date.strftime("%Y-%m-%d")

    return [
        (
            commit,
            author,
            date,
            f"https://github.com/{user}/{repo}/commit/{commit}",
            status,
            tracked_file,
        )
        for commit, author, date in entries
    ]


def find_commits(github_repo_path: str) -> Iterator[Tuple[str, str, str, str, str, str]]:
    """Find every commit that added or deleted one of the tracked files.

    For each name in the module-level ``deleted_files`` list, ``git log`` is
    run inside *github_repo_path* twice (``--diff-filter=A`` and
    ``--diff-filter=D``). The raw output is stored under
    ``<project_dir>/<output_dir>/texts/<user>/<repo>/`` as
    ``added_<file>.txt`` / ``removed_<file>.txt`` and then parsed.

    Args:
        github_repo_path: Path of a local clone whose last two components are
            ``<user>/<repo>``; they are used to build the GitHub commit URLs.

    Returns:
        An iterator of ``(commit, author, date, url, status, filename)``
        tuples of strings. ``status`` is ``"added"`` or ``"removed"`` and
        ``date`` is formatted as ``YYYY-MM-DD``. For each file the "added"
        rows come before the "removed" rows, in the order git printed them.

    Raises:
        NotADirectoryError: If *github_repo_path* is not an existing directory.
        OSError: If the output directory or a log file cannot be created, or
            the ``git`` executable cannot be started.
    """
    # os.path helpers rather than split("/") so that trailing or doubled
    # separators and non-"/" platform separators are handled.
    owner_dir, repo = os.path.split(os.path.normpath(github_repo_path))
    user = os.path.basename(owner_dir)

    texts_dir = os.path.join(project_dir, output_dir, "texts", user, repo)
    os.makedirs(texts_dir, exist_ok=True)

    if not os.path.isdir(github_repo_path):
        raise NotADirectoryError(f"not a directory: {github_repo_path}")

    rows = []
    for deleted_file in deleted_files:
        for status, diff_filter in (("added", "A"), ("removed", "D")):
            log_path = os.path.join(texts_dir, f"{status}_{deleted_file}.txt")
            _run_git_log(github_repo_path, diff_filter, deleted_file, log_path)
            rows.extend(_parse_git_log(log_path, status, deleted_file, user, repo))

    return iter(rows)
# One dict per commit row found across all repositories; becomes the CSV rows.
data = []

# Every match of <project_dir>/<repository_dir>/*/* is handed to find_commits
# as a repository path of the form .../<username>/<repo>.
for repo_path in glob(os.path.join(project_dir, repository_dir, "*", "*")):
    print("processing:", repo_path)
    try:
        # glob returns paths using the platform separator, so the last two
        # components are taken with os.path instead of splitting on "/".
        owner_dir, repo = os.path.split(repo_path)
        username = os.path.basename(owner_dir)
        for commit, author, date, url, status, filename in find_commits(repo_path):
            data.append(
                {
                    "username": username,
                    "repo": repo,
                    "commit": commit,
                    "author": author,
                    "date": date,
                    "url": url,
                    "status": status,
                    "file": filename,
                }
            )
    except Exception as e:
        # Top-level boundary: report the failure and continue with the next match.
        print(e)

# The CSV is only written when at least one row was collected.
if data:
    pd.DataFrame(data).to_csv(os.path.join(project_dir, output_dir, "abandoned.csv"), index=False)
